Part 1- Supervised Learning¶

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
import time
from sklearn.preprocessing import StandardScaler
pd.options.display.max_columns = None
pd.options.display.max_rows = 80
%matplotlib inline
In [2]:
# Load the UCI bank-marketing dataset; fields are semicolon-separated.
data = pd.read_csv('bank.csv', sep=';')
In [3]:
# Rich display of the full frame (41,188 rows x 21 columns, truncated by pandas).
data
Out[3]:
age job marital education default housing loan contact month day_of_week duration campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
0 56 housemaid married basic.4y no no no telephone may mon 261 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no
1 57 services married high.school unknown no no telephone may mon 149 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no
2 37 services married high.school no yes no telephone may mon 226 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no
3 40 admin. married basic.6y no no no telephone may mon 151 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no
4 56 services married high.school no no yes telephone may mon 307 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
41183 73 retired married professional.course no yes no cellular nov fri 334 1 999 0 nonexistent -1.1 94.767 -50.8 1.028 4963.6 yes
41184 46 blue-collar married professional.course no no no cellular nov fri 383 1 999 0 nonexistent -1.1 94.767 -50.8 1.028 4963.6 no
41185 56 retired married university.degree no yes no cellular nov fri 189 2 999 0 nonexistent -1.1 94.767 -50.8 1.028 4963.6 no
41186 44 technician married professional.course no no no cellular nov fri 442 1 999 0 nonexistent -1.1 94.767 -50.8 1.028 4963.6 yes
41187 74 retired married professional.course no yes no cellular nov fri 239 3 999 1 failure -1.1 94.767 -50.8 1.028 4963.6 no

41188 rows × 21 columns

In [4]:
# (rows, columns) of the dataset.
data.shape
Out[4]:
(41188, 21)
In [5]:
# Per-column count of missing values (all zero here).
data.isnull().sum()
Out[5]:
age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64
In [6]:
# List all column names.
data.columns
Out[6]:
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

Cleaning Column Names¶

In [7]:
# TODO: column-name cleanup (e.g. renaming dotted names like 'emp.var.rate') was planned here but never implemented; columns are used as-is below.

Q1 -- EDA - Analysis¶

In [8]:
# Correlation heatmap of the numeric features.
# Restrict to numeric columns explicitly: on pandas >= 2.0, DataFrame.corr()
# raises on object columns instead of silently dropping them.
data_corr = data.select_dtypes(include=np.number).corr()
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(data_corr, annot=True, fmt='0.2f', linewidths=0.5, cmap='Greens', ax=ax)
plt.title('Correlation Matrix')
plt.show()  # fixed: was `plt.show` (missing parentheses), which only echoed the function repr
Out[8]:
<function matplotlib.pyplot.show(close=None, block=None)>
No description has been provided for this image

Conclusions:¶

  • Highest Direct Relation: euribor3m & emp.var.rate: 0.97, euribor3m & nr.employed: 0.95, nr.employed & emp.var.rate: 0.91.
  • Highest Inverse Relation: previous & pdays: -0.59, nr.employed & previous: -0.50, euribor3m & previous: -0.45.
In [9]:
# Mean age per job category (seaborn barplot aggregates y='age' by mean).
plt.figure(figsize=(10,5))
plt.xticks(rotation=45)
sns.barplot(x='job', data=data, y='age', palette='rainbow')
plt.title('Jobs Vs Age', fontsize='15')
plt.show()  # fixed: was `plt.show` (missing parentheses), which only echoed the function repr
Out[9]:
<function matplotlib.pyplot.show(close=None, block=None)>
No description has been provided for this image

Q2 -- Perform the following pre-processing tasks:¶

a. Missing Value Analysis¶

In [10]:
# Missing-value analysis (repeats the earlier check; still all zero).
data.isnull().sum()
Out[10]:
age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64
In [11]:
# NOTE(review): `.unique()` is redundant here — column names in an Index are
# already unique; `data.columns` alone gives the same result.
data.columns.unique()
Out[11]:
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')
In [12]:
# Dtypes and non-null counts: 11 object columns need encoding, no nulls.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null  float64
 18  euribor3m       41188 non-null  float64
 19  nr.employed     41188 non-null  float64
 20  y               41188 non-null  object 
dtypes: float64(5), int64(5), object(11)
memory usage: 6.6+ MB

There are no null values in the data set¶

b. Label Encoding wherever required¶

In [13]:
# Sub-frame of the categorical (object-dtype) columns; its column names
# drive the label-encoding loop below.
object_cols = data.select_dtypes(include='object')
In [14]:
# Inspect the distinct values of every categorical column to gauge cardinality
# before encoding.
categorical = data.select_dtypes(include=['object'])
for column in categorical:
    print(column, '=', data[column].unique(), '\n')
job = ['housemaid' 'services' 'admin.' 'blue-collar' 'technician' 'retired'
 'management' 'unemployed' 'self-employed' 'unknown' 'entrepreneur'
 'student'] 

marital = ['married' 'single' 'divorced' 'unknown'] 

education = ['basic.4y' 'high.school' 'basic.6y' 'basic.9y' 'professional.course'
 'unknown' 'university.degree' 'illiterate'] 

default = ['no' 'unknown' 'yes'] 

housing = ['no' 'yes' 'unknown'] 

loan = ['no' 'yes' 'unknown'] 

contact = ['telephone' 'cellular'] 

month = ['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'mar' 'apr' 'sep'] 

day_of_week = ['mon' 'tue' 'wed' 'thu' 'fri'] 

poutcome = ['nonexistent' 'failure' 'success'] 

y = ['no' 'yes'] 

There are 10 Features and Target to which Label Encoding needs to be applied¶

In [15]:
from sklearn.preprocessing import LabelEncoder
# Single encoder instance, re-fit per column in the loop below.
le = LabelEncoder()
In [16]:
# Work on a copy so the raw `data` frame stays intact for reference.
data1 = data.copy()
In [17]:
# Label-encode every categorical column. Fit one LabelEncoder per column and
# keep it, so each column's category -> code mapping survives (re-fitting a
# single shared encoder retains only the last column's mapping, which makes
# inverse_transform unusable for the rest). The resulting integer codes are
# identical to re-fitting one shared encoder.
encoders = {}
for col in object_cols:
    encoders[col] = LabelEncoder()
    data1[col] = encoders[col].fit_transform(data1[col])
In [18]:
# Verify the encoding: all former object columns are now integer codes.
data1.head()
Out[18]:
age job marital education default housing loan contact month day_of_week duration campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
0 56 3 1 0 0 0 0 1 6 1 261 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 0
1 57 7 1 3 1 0 0 1 6 1 149 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 0
2 37 7 1 3 0 2 0 1 6 1 226 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 0
3 40 0 1 1 0 0 0 1 6 1 151 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 0
4 56 7 1 3 0 0 2 1 6 1 307 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 0

Label Encoding Done ✅¶

In [19]:
# Separate the target ('y': 0 = no, 1 = yes after encoding) from the features.
target = data1['y']
X = data1.drop(columns=['y'])

c. Selecting important features based on Random Forest¶

In [20]:
from sklearn.ensemble import RandomForestClassifier
# Default random forest fitted on the FULL dataset — used only to rank
# feature importances, not for held-out evaluation.
rfc = RandomForestClassifier()
rfc.fit(X,target)
Out[20]:
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
In [21]:
# Rank features by importance (most important first) and visualise the ranking.
order = np.argsort(rfc.feature_importances_)[::-1]
feature_names = list(X.columns[order])
plt.figure(figsize=(10, 7))
sns.barplot(x=rfc.feature_importances_[order], y=feature_names, palette="rainbow")
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()
No description has been provided for this image

e. Standardize the data using any one of the scalers provided by sklearn¶

In [22]:
# NOTE(review): StandardScaler was already imported in the first cell;
# this re-import is redundant but harmless.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
In [23]:
# Standardize all features (zero mean, unit variance), keeping column names.
X_scaled = pd.DataFrame(sc.fit_transform(X), columns=X.columns)
In [24]:
# Preview of the standardized feature matrix.
X_scaled
Out[24]:
age job marital education default housing loan contact month day_of_week duration campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
0 1.533034 -0.201579 -0.283741 -1.753925 -0.513600 -1.087707 -0.452491 1.31827 0.762558 -0.718834 0.010471 -0.565922 0.195414 -0.349494 0.192622 0.648092 0.722722 0.886447 0.712460 0.331680
1 1.628993 0.911227 -0.283741 -0.349730 1.945327 -1.087707 -0.452491 1.31827 0.762558 -0.718834 -0.421501 -0.565922 0.195414 -0.349494 0.192622 0.648092 0.722722 0.886447 0.712460 0.331680
2 -0.290186 0.911227 -0.283741 -0.349730 -0.513600 0.942127 -0.452491 1.31827 0.762558 -0.718834 -0.124520 -0.565922 0.195414 -0.349494 0.192622 0.648092 0.722722 0.886447 0.712460 0.331680
3 -0.002309 -1.036184 -0.283741 -1.285860 -0.513600 -1.087707 -0.452491 1.31827 0.762558 -0.718834 -0.413787 -0.565922 0.195414 -0.349494 0.192622 0.648092 0.722722 0.886447 0.712460 0.331680
4 1.533034 0.911227 -0.283741 -0.349730 -0.513600 -1.087707 2.311440 1.31827 0.762558 -0.718834 0.187888 -0.565922 0.195414 -0.349494 0.192622 0.648092 0.722722 0.886447 0.712460 0.331680
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
41183 3.164336 0.354824 -0.283741 0.586399 -0.513600 0.942127 -0.452491 -0.75857 1.193593 -1.434368 0.292025 -0.565922 0.195414 -0.349494 0.192622 -0.752343 2.058168 -2.224953 -1.495186 -2.815697
41184 0.573445 -0.757982 -0.283741 0.586399 -0.513600 -1.087707 -0.452491 -0.75857 1.193593 -1.434368 0.481012 -0.565922 0.195414 -0.349494 0.192622 -0.752343 2.058168 -2.224953 -1.495186 -2.815697
41185 1.533034 0.354824 -0.283741 1.054464 -0.513600 0.942127 -0.452491 -0.75857 1.193593 -1.434368 -0.267225 -0.204909 0.195414 -0.349494 0.192622 -0.752343 2.058168 -2.224953 -1.495186 -2.815697
41186 0.381527 1.467630 -0.283741 0.586399 -0.513600 -1.087707 -0.452491 -0.75857 1.193593 -1.434368 0.708569 -0.565922 0.195414 -0.349494 0.192622 -0.752343 2.058168 -2.224953 -1.495186 -2.815697
41187 3.260295 0.354824 -0.283741 0.586399 -0.513600 0.942127 -0.452491 -0.75857 1.193593 -1.434368 -0.074380 0.156105 0.195414 1.671136 -2.563098 -0.752343 2.058168 -2.224953 -1.495186 -2.815697

41188 rows × 20 columns

In [25]:
# Encoded target series (0/1).
target
Out[25]:
0        0
1        0
2        0
3        0
4        0
        ..
41183    1
41184    0
41185    0
41186    1
41187    0
Name: y, Length: 41188, dtype: int32
In [26]:
from sklearn.model_selection import train_test_split
# 80/20 train/test split, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, target, test_size=0.2, random_state=42)
In [27]:
# Second random forest, this time fitted only on the training split
# (importances below are therefore not influenced by the test set).
rfc2 = RandomForestClassifier()
rfc2.fit(X_train, y_train)
Out[27]:
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
In [28]:
# Feature-importance ranking from the train-only forest, plus the raw array.
rank = np.argsort(rfc2.feature_importances_)[::-1]
plt.figure(figsize=(10, 7))
sns.barplot(
    x=rfc2.feature_importances_[rank],
    y=list(X_scaled.columns[rank]),
    palette="rainbow",
)
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()
rfc2.feature_importances_
No description has been provided for this image
Out[28]:
array([0.09116632, 0.0485781 , 0.02355482, 0.04346493, 0.00895397,
       0.02018895, 0.01539676, 0.01034687, 0.01891394, 0.04156039,
       0.31438321, 0.04238978, 0.03515509, 0.01317983, 0.02577809,
       0.02439972, 0.02175246, 0.02815069, 0.10868429, 0.06400179])

Q.3 & Q.4 Build the following Supervised Learning models:¶

a. Logistic Regression¶

In [29]:
from sklearn.linear_model import LogisticRegression
# max_iter raised well above the default (100) so the solver converges
# without warnings on the standardized data.
lc = LogisticRegression(random_state=0, max_iter=10000)
lc.fit(X_train,y_train)
Out[29]:
LogisticRegression(max_iter=10000, random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(max_iter=10000, random_state=0)
In [30]:
# NOTE(review): f1_score is imported but never used in this notebook.
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score
y_pred = lc.predict(X_test)
# Rows = actual (no, yes); columns = predicted.
print(confusion_matrix(y_test,y_pred))
[[7108  195]
 [ 542  393]]
In [31]:
# Hold-out accuracy of logistic regression, reported as a percentage.
acc = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy score is {round(acc, 2)}%")
Accuracy score is 91.05%

Applying Cross Validation Score¶

In [32]:
from sklearn.model_selection import cross_val_score
# Mean 10-fold CV accuracy over the whole dataset (default LogisticRegression,
# so default max_iter=100 here — different settings than `lc` above).
np.mean(cross_val_score(LogisticRegression(),X_scaled,target,cv=10,scoring='accuracy'))
Out[32]:
0.8317141827617217

b. Decision Trees¶

In [33]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default hyperparameters (unbounded depth).
dtc = DecisionTreeClassifier()
In [34]:
from sklearn.model_selection import train_test_split
# NOTE(review): redundant — identical arguments and seed as the earlier split,
# so this reproduces exactly the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, target, test_size=0.2, random_state=42)
In [35]:
# Train the decision tree on the training split.
dtc.fit(X_train, y_train)
Out[35]:
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
In [36]:
# Predictions on the held-out test set (overwrites the earlier LR y_pred).
y_pred = dtc.predict(X_test)
In [37]:
from sklearn.metrics import accuracy_score,confusion_matrix
# Decision-tree hold-out accuracy (~0.889).
accuracy_score(y_test, y_pred)
Out[37]:
0.8885651857246905
In [38]:
# Confusion matrix for the decision tree (rows = actual, columns = predicted).
confusion_matrix(y_test,y_pred)
Out[38]:
array([[6843,  460],
       [ 458,  477]], dtype=int64)

c. Random Forest¶

In [39]:
from sklearn.ensemble import RandomForestClassifier
# Hand-tuned forest: limit features per split and tree depth to curb
# overfitting; n_jobs=-1 uses all CPU cores.
rfc = RandomForestClassifier(max_features=7, max_depth=8, n_jobs=-1)
In [40]:
from sklearn.model_selection import train_test_split
# NOTE(review): random_state=0 here vs 42 for the LR/DT splits, so this model
# is evaluated on a different test partition — accuracies are not strictly
# comparable across the three models.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, target, test_size=0.2, random_state=0)
In [41]:
# Train the tuned forest (rebinds `rfc`, shadowing the earlier default forest).
rfc.fit(X_train, y_train)
Out[41]:
RandomForestClassifier(max_depth=8, max_features=7, n_jobs=-1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_depth=8, max_features=7, n_jobs=-1)
In [42]:
from sklearn.metrics import accuracy_score
y_pred = rfc.predict(X_test)
# Tuned random-forest hold-out accuracy (~0.922).
accuracy_score(y_test, y_pred)
Out[42]:
0.9217042971595047

From above analysis We conclude that:¶

Logistic Regression - Accuracy: 83.17% After CV.¶

Decision Tree - Accuracy: 88.86%.¶

Random Forest Classifier - Accuracy: 92.17% After Tuning Hyperparameters.¶

Part 2 – Unsupervised Learning¶

In [43]:
# Load the credit-card customer dataset for the unsupervised part.
credit = pd.read_csv('credit_card.csv')
In [44]:
# First five customers.
credit.head()
Out[44]:
CUST_ID BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
0 C10001 40.900749 0.818182 95.40 0.00 95.4 0.000000 0.166667 0.000000 0.083333 0.000000 0 2 1000.0 201.802084 139.509787 0.000000 12
1 C10002 3202.467416 0.909091 0.00 0.00 0.0 6442.945483 0.000000 0.000000 0.000000 0.250000 4 0 7000.0 4103.032597 1072.340217 0.222222 12
2 C10003 2495.148862 1.000000 773.17 773.17 0.0 0.000000 1.000000 1.000000 0.000000 0.000000 0 12 7500.0 622.066742 627.284787 0.000000 12
3 C10004 1666.670542 0.636364 1499.00 1499.00 0.0 205.788017 0.083333 0.083333 0.000000 0.083333 1 1 7500.0 0.000000 NaN 0.000000 12
4 C10005 817.714335 1.000000 16.00 16.00 0.0 0.000000 0.083333 0.083333 0.000000 0.000000 0 1 1200.0 678.334763 244.791237 0.000000 12
In [45]:
# 8,950 customers x 18 columns.
credit.shape
Out[45]:
(8950, 18)

Q1. Perform EDA on the given data. What does the primary analysis of several Numeric features reveal?¶

In [46]:
# Correlation heatmap of the credit-card features.
# Restrict to numeric columns explicitly: CUST_ID is still present here and
# on pandas >= 2.0 DataFrame.corr() raises on object columns instead of
# silently dropping them.
credit_corr = credit.select_dtypes(include=np.number).corr()
fig, ax = plt.subplots(figsize=(24, 20))
sns.heatmap(credit_corr, annot=True, fmt='0.2f', linewidths=0.5, cmap='Blues', ax=ax)
plt.title('Correlation Matrix')
plt.show()  # fixed: was `plt.show` (missing parentheses), which only echoed the function repr
Out[46]:
<function matplotlib.pyplot.show(close=None, block=None)>
No description has been provided for this image

Conclusions:¶

  • Highest Direct Relation: ONEOFF_PURCHASES & PURCHASES: 0.92, PURCHASES & PURCHASES_TRX: 0.69
In [47]:
# List all column names of the credit-card frame.
credit.columns
Out[47]:
Index(['CUST_ID', 'BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES',
       'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE',
       'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
       'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
       'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS',
       'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT', 'TENURE'],
      dtype='object')
In [48]:
# Drop the customer identifier: a unique string key with no value for
# clustering. (The original cell also evaluated
# `credit.select_dtypes(include='object')` and discarded the result — that
# dead statement has been removed.)
credit = credit.drop(columns='CUST_ID')
In [49]:
# Pairwise scatter/histogram grid of all 17 numeric features.
# NOTE(review): 17x17 = 289 panels — this cell is slow and memory-heavy.
sns.pairplot(data=credit)
plt.show()
No description has been provided for this image
No description has been provided for this image

Q2. Perform the following Exploratory Data Analysis tasks:¶

a. Missing Value Analysis¶

In [50]:
# Missing values: CREDIT_LIMIT has 1, MINIMUM_PAYMENTS has 313.
credit.isna().sum()
Out[50]:
BALANCE                               0
BALANCE_FREQUENCY                     0
PURCHASES                             0
ONEOFF_PURCHASES                      0
INSTALLMENTS_PURCHASES                0
CASH_ADVANCE                          0
PURCHASES_FREQUENCY                   0
ONEOFF_PURCHASES_FREQUENCY            0
PURCHASES_INSTALLMENTS_FREQUENCY      0
CASH_ADVANCE_FREQUENCY                0
CASH_ADVANCE_TRX                      0
PURCHASES_TRX                         0
CREDIT_LIMIT                          1
PAYMENTS                              0
MINIMUM_PAYMENTS                    313
PRC_FULL_PAYMENT                      0
TENURE                                0
dtype: int64
In [51]:
# Value counts including NaN, to confirm the 313 missing MINIMUM_PAYMENTS.
credit['MINIMUM_PAYMENTS'].value_counts(dropna=False)
Out[51]:
NaN            313
299.351881       2
150.317143       1
271.528169       1
6404.855484      1
              ... 
181.773223       1
711.894455       1
256.522546       1
127.799107       1
88.288956        1
Name: MINIMUM_PAYMENTS, Length: 8637, dtype: int64
In [52]:
# Summary statistics; note the reduced counts for CREDIT_LIMIT (8949)
# and MINIMUM_PAYMENTS (8637) caused by the missing values.
credit.describe()
Out[52]:
BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
count 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8950.000000 8949.000000 8950.000000 8637.000000 8950.000000 8950.000000
mean 1564.474828 0.877271 1003.204834 592.437371 411.067645 978.871112 0.490351 0.202458 0.364437 0.135144 3.248827 14.709832 4494.449450 1733.143852 864.206542 0.153715 11.517318
std 2081.531879 0.236904 2136.634782 1659.887917 904.338115 2097.163877 0.401371 0.298336 0.397448 0.200121 6.824647 24.857649 3638.815725 2895.063757 2372.446607 0.292499 1.338331
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 50.000000 0.000000 0.019163 0.000000 6.000000
25% 128.281915 0.888889 39.635000 0.000000 0.000000 0.000000 0.083333 0.000000 0.000000 0.000000 0.000000 1.000000 1600.000000 383.276166 169.123707 0.000000 12.000000
50% 873.385231 1.000000 361.280000 38.000000 89.000000 0.000000 0.500000 0.083333 0.166667 0.000000 0.000000 7.000000 3000.000000 856.901546 312.343947 0.000000 12.000000
75% 2054.140036 1.000000 1110.130000 577.405000 468.637500 1113.821139 0.916667 0.300000 0.750000 0.222222 4.000000 17.000000 6500.000000 1901.134317 825.485459 0.142857 12.000000
max 19043.138560 1.000000 49039.570000 40761.250000 22500.000000 47137.211760 1.000000 1.000000 1.000000 1.500000 123.000000 358.000000 30000.000000 50721.483360 76406.207520 1.000000 12.000000

There are 2 parameters (CREDIT_LIMIT and MINIMUM_PAYMENTS) which have NaN values.¶

In [53]:
# Impute the 313 missing MINIMUM_PAYMENTS with the column mean.
# NOTE(review): this column is heavily right-skewed (median ~312 vs max
# ~76,406 per describe() above), so the mean is pulled upward; median
# imputation may be worth considering.
credit['MINIMUM_PAYMENTS'] = credit['MINIMUM_PAYMENTS'].fillna(credit['MINIMUM_PAYMENTS'].mean())
In [54]:
# Confirm no NaNs remain in MINIMUM_PAYMENTS.
credit['MINIMUM_PAYMENTS'].isna().sum()
Out[54]:
0

Dropping the 1 row with a missing CREDIT_LIMIT value¶

In [55]:
# Drop the single remaining row with a NaN (the one missing CREDIT_LIMIT),
# leaving 8,949 rows.
credit = credit.dropna()
In [56]:
# Confirm the frame is now fully non-null.
credit.isna().sum()
Out[56]:
BALANCE                             0
BALANCE_FREQUENCY                   0
PURCHASES                           0
ONEOFF_PURCHASES                    0
INSTALLMENTS_PURCHASES              0
CASH_ADVANCE                        0
PURCHASES_FREQUENCY                 0
ONEOFF_PURCHASES_FREQUENCY          0
PURCHASES_INSTALLMENTS_FREQUENCY    0
CASH_ADVANCE_FREQUENCY              0
CASH_ADVANCE_TRX                    0
PURCHASES_TRX                       0
CREDIT_LIMIT                        0
PAYMENTS                            0
MINIMUM_PAYMENTS                    0
PRC_FULL_PAYMENT                    0
TENURE                              0
dtype: int64

b. Outlier Treatment using the Z-score method¶

In [57]:
# Re-check summary statistics after imputation/row drop (all counts = 8949).
credit.describe()
Out[57]:
BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
count 8949.000000 8949.000000 8949.000000 8949.000000 8949.000000 8949.000000 8949.000000 8949.000000 8949.000000 8949.000000 8949.000000 8949.000000 8949.000000 8949.000000 8949.000000 8949.000000 8949.000000
mean 1564.647593 0.877350 1003.316936 592.503572 411.113579 978.959616 0.490405 0.202480 0.364478 0.135141 3.249078 14.711476 4494.449450 1733.336511 864.301501 0.153732 11.517935
std 2081.584016 0.236798 2136.727848 1659.968851 904.378205 2097.264344 0.401360 0.298345 0.397451 0.200132 6.824987 24.858552 3638.815725 2895.168146 2330.700932 0.292511 1.337134
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 50.000000 0.000000 0.019163 0.000000 6.000000
25% 128.365782 0.888889 39.800000 0.000000 0.000000 0.000000 0.083333 0.000000 0.000000 0.000000 0.000000 1.000000 1600.000000 383.282850 170.875613 0.000000 12.000000
50% 873.680279 1.000000 361.490000 38.000000 89.000000 0.000000 0.500000 0.083333 0.166667 0.000000 0.000000 7.000000 3000.000000 857.062706 335.657631 0.000000 12.000000
75% 2054.372848 1.000000 1110.170000 577.830000 468.650000 1113.868654 0.916667 0.300000 0.750000 0.222222 4.000000 17.000000 6500.000000 1901.279320 864.206542 0.142857 12.000000
max 19043.138560 1.000000 49039.570000 40761.250000 22500.000000 47137.211760 1.000000 1.000000 1.000000 1.500000 123.000000 358.000000 30000.000000 50721.483360 76406.207520 1.000000 12.000000
In [58]:
from scipy.stats import zscore
# Z-score outlier treatment: keep only rows where EVERY feature lies within
# 3 standard deviations of its column mean.
upd_credit = credit[(np.abs(zscore(credit)) < 3).all(axis=1)]
In [59]:
# Preview the outlier-filtered frame.
upd_credit.head()
Out[59]:
BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
0 40.900749 0.818182 95.40 0.00 95.4 0.000000 0.166667 0.000000 0.083333 0.000000 0 2 1000.0 201.802084 139.509787 0.000000 12
1 3202.467416 0.909091 0.00 0.00 0.0 6442.945483 0.000000 0.000000 0.000000 0.250000 4 0 7000.0 4103.032597 1072.340217 0.222222 12
2 2495.148862 1.000000 773.17 773.17 0.0 0.000000 1.000000 1.000000 0.000000 0.000000 0 12 7500.0 622.066742 627.284787 0.000000 12
3 1666.670542 0.636364 1499.00 1499.00 0.0 205.788017 0.083333 0.083333 0.000000 0.083333 1 1 7500.0 0.000000 864.206542 0.000000 12
4 817.714335 1.000000 16.00 16.00 0.0 0.000000 0.083333 0.083333 0.000000 0.000000 0 1 1200.0 678.334763 244.791237 0.000000 12

c. Deal with correlated variables.¶

In [60]:
# Re-examine pairwise correlations after missing-value treatment.
# NOTE(review): this uses `credit` (before outlier removal), not `upd_credit`;
# confirm that is intentional.
credit_new_corr = credit.corr()
fig, ax = plt.subplots(figsize=(24,20))
sns.heatmap(credit_new_corr, annot=True, fmt='0.2f', linewidths=0.5, cmap='Blues')
plt.title('Correlation Matrix')
plt.show()
No description has been provided for this image

Q3. Perform dimensionality reduction using PCA such that the 95% of the variance is explained¶

In [61]:
from sklearn.preprocessing import StandardScaler
# Standardize the outlier-filtered features before PCA (PCA is scale-sensitive).
sc = StandardScaler()
final_data = pd.DataFrame(sc.fit_transform(upd_credit), columns=upd_credit.columns)
In [62]:
# Preview the standardized input to PCA.
final_data.head()
Out[62]:
BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
0 -0.809652 -0.334911 -0.641448 -0.543505 -0.448067 -0.543984 -0.805098 -0.668791 -0.698321 -0.690806 -0.587319 -0.665235 -1.021765 -0.762853 -0.577804 -0.531523 0.312187
1 1.273272 0.091127 -0.734487 -0.543505 -0.636612 4.671914 -1.225631 -0.668791 -0.910579 0.883825 0.427706 -0.797517 1.022039 1.953121 0.623747 0.232637 0.312187
2 0.807271 0.517164 0.019551 0.430864 -0.636612 -0.543984 1.297564 2.808268 -0.910579 -0.690806 -0.587319 -0.003826 1.192356 -0.470272 0.050485 -0.531523 0.312187
3 0.261448 -1.186986 0.727420 1.345575 -0.636612 -0.377388 -1.015366 -0.379037 -0.910579 -0.165931 -0.333563 -0.731376 1.192356 -0.903344 0.355657 -0.531523 0.312187
4 -0.297867 0.517164 -0.718883 -0.523341 -0.636612 -0.543984 -1.015366 -0.379037 -0.910579 -0.690806 -0.587319 -0.731376 -0.953638 -0.431099 -0.442194 -0.531523 0.312187

Transformed data before applying PCA¶

In [63]:
from sklearn.decomposition import PCA
# Fit PCA keeping ALL components first, to inspect the variance profile
# before choosing a cut-off.
pca = PCA()
pca_data = pca.fit_transform(final_data)
In [64]:
# Per-component fraction of total variance explained.
explained_variance = pca.explained_variance_ratio_
explained_variance
Out[64]:
array([2.89681469e-01, 2.03617595e-01, 9.53162936e-02, 7.70335753e-02,
       6.31629683e-02, 5.38697893e-02, 4.67017544e-02, 3.81408019e-02,
       3.45633100e-02, 2.83042968e-02, 1.81173944e-02, 1.72817431e-02,
       1.41427165e-02, 1.14430418e-02, 6.35293736e-03, 2.26833379e-03,
       1.98008996e-06])
In [65]:
# Cumulative explained variance — used to pick the smallest component count
# reaching the 95% target.
np.cumsum((pca.explained_variance_ratio_))
Out[65]:
array([0.28968147, 0.49329906, 0.58861536, 0.66564893, 0.7288119 ,
       0.78268169, 0.82938344, 0.86752425, 0.90208756, 0.93039185,
       0.94850925, 0.96579099, 0.97993371, 0.99137675, 0.99772969,
       0.99999802, 1.        ])

We need to explain variance up to 95%, hence we select 12 components¶

Total of 12 Components = 96.58%¶

In [66]:
# Reduce to 12 components, which retain ~96.58% of the variance per the
# cumulative sum above (the first count to exceed the 95% target).
pca_2 = PCA(n_components = 12)
final_df = pca_2.fit_transform(final_data)
In [67]:
# Reduced representation: one 12-dimensional row per customer (numpy array).
final_df
Out[67]:
array([[-1.32562517, -2.04662581,  0.20416784, ..., -0.1254455 ,
         0.03903563, -0.28149219],
       [-2.78107354,  3.31127858,  0.44440901, ...,  1.49002356,
         1.29672022, -1.88447238],
       [ 1.23742926,  0.54476019,  1.47222511, ...,  0.99620197,
        -0.77998389, -0.45164322],
       ...,
       [-2.57473066, -1.14053369,  0.46583268, ..., -0.1672788 ,
        -0.21099164, -0.28364622],
       [-0.11663847, -1.95458381, -1.11373905, ...,  0.24033441,
        -0.12742849,  0.26632671],
       [-2.51578369, -0.8759746 ,  0.50195631, ..., -0.39762079,
        -0.82680118,  0.38732394]])
In [68]:
# Wrap the PCA output array in a DataFrame for the clustering steps below.
final_df = pd.DataFrame(final_df)
In [69]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Elbow method: fit K-Means for k = 1..10 and plot the within-cluster sum of
# squares (inertia) to look for the "elbow" in the curve. Timed for reference.
start = time.time()
k_values = range(1, 11)
wcss = []
for k in k_values:
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(final_df)
    wcss.append(model.inertia_)
plt.plot(k_values, wcss, 'rx-')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
print(time.time() - start)
No description has been provided for this image
11.518290519714355
In [70]:
# Silhouette analysis: average silhouette score for k = 2..10 (silhouette is
# undefined for a single cluster). Higher is better. Timed for reference.
start = time.time()
k_values = range(2, 11)
scores = []
for k in k_values:
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(final_df)
    scores.append(silhouette_score(final_df, model.labels_))
plt.plot(k_values, scores, 'bx-')
plt.title('Silhouette Analysis')
plt.xlabel('Values of K')
plt.ylabel('Silhouette Score')
plt.show()
print(time.time() - start)
No description has been provided for this image
17.55561375617981

There are 3 clusters¶

In [71]:
# Final clustering with k = 3, chosen from the elbow/silhouette analysis above.
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(final_df)
In [72]:
# Attach each customer's cluster label to the PCA-space frame.
final_df['clusters'] = clusters
In [73]:
# Cluster sizes (number of customers per cluster).
final_df['clusters'].value_counts()
Out[73]:
1    4177
2    1791
0    1466
Name: clusters, dtype: int64